# title: "QC_Jan_2023_Seurat"
# output: html_document
# date: "2023-04-13"
library(SingleCellExperiment)
library(Seurat)
library(tidyverse)
library(Matrix)
library(scales)
library(cowplot)
library(RCurl)
make sure this script is in the same folder as the data
#setwd("~/OneDrive - University of Florida/Schnitzler_Lab/Jingwei/10X/Jan_2023")
# Create a Seurat object for each sample
for (file in c("BR4_lib1_filtered_feature_bc_matrix",
"BR4_lib2_filtered_feature_bc_matrix",
"BR5_lib1_filtered_feature_bc_matrix",
"BR5_lib2_filtered_feature_bc_matrix")){
seurat_data <- Read10X(data.dir = paste0("data/", file))
seurat_obj <- CreateSeuratObject(counts = seurat_data,
min.features = 100,
project = file)
assign(file, seurat_obj)
}
# Merge all samples into a single Seurat object
merged_seurat <- merge(x = BR4_lib1_filtered_feature_bc_matrix,
y = c(BR4_lib2_filtered_feature_bc_matrix,
BR5_lib1_filtered_feature_bc_matrix,
BR5_lib2_filtered_feature_bc_matrix),
add.cell.id = c("BR4_lib1", "BR4_lib2", "BR5_lib1", "BR5_lib2"))
# Check that the merged object has the appropriate sample-specific prefixes
head(merged_seurat@meta.data)
## orig.ident nCount_RNA
## BR4_lib1_AAACCCAAGAGATGCC-1 BR4_lib1_filtered_feature_bc_matrix 1355
## BR4_lib1_AAACCCAAGGTTGACG-1 BR4_lib1_filtered_feature_bc_matrix 723
## BR4_lib1_AAACCCAAGTGGTCAG-1 BR4_lib1_filtered_feature_bc_matrix 663
## BR4_lib1_AAACCCACAAACGTGG-1 BR4_lib1_filtered_feature_bc_matrix 841
## BR4_lib1_AAACCCACAAGAGGCT-1 BR4_lib1_filtered_feature_bc_matrix 810
## BR4_lib1_AAACCCACAGAAACCG-1 BR4_lib1_filtered_feature_bc_matrix 936
## nFeature_RNA
## BR4_lib1_AAACCCAAGAGATGCC-1 857
## BR4_lib1_AAACCCAAGGTTGACG-1 540
## BR4_lib1_AAACCCAAGTGGTCAG-1 463
## BR4_lib1_AAACCCACAAACGTGG-1 473
## BR4_lib1_AAACCCACAAGAGGCT-1 502
## BR4_lib1_AAACCCACAGAAACCG-1 621
tail(merged_seurat@meta.data)
## orig.ident nCount_RNA
## BR5_lib2_TTTGTTGCAGGACTTT-1 BR5_lib2_filtered_feature_bc_matrix 2104
## BR5_lib2_TTTGTTGGTAAGATCA-1 BR5_lib2_filtered_feature_bc_matrix 1968
## BR5_lib2_TTTGTTGGTCAAATCC-1 BR5_lib2_filtered_feature_bc_matrix 1951
## BR5_lib2_TTTGTTGGTTACCCTC-1 BR5_lib2_filtered_feature_bc_matrix 806
## BR5_lib2_TTTGTTGTCAAATAGG-1 BR5_lib2_filtered_feature_bc_matrix 3390
## BR5_lib2_TTTGTTGTCATTTGGG-1 BR5_lib2_filtered_feature_bc_matrix 924
## nFeature_RNA
## BR5_lib2_TTTGTTGCAGGACTTT-1 1119
## BR5_lib2_TTTGTTGGTAAGATCA-1 1074
## BR5_lib2_TTTGTTGGTCAAATCC-1 1190
## BR5_lib2_TTTGTTGGTTACCCTC-1 556
## BR5_lib2_TTTGTTGTCAAATAGG-1 1600
## BR5_lib2_TTTGTTGTCATTTGGG-1 635
### QC ###
# Add number of genes per UMI for each cell to metadata
merged_seurat$log10GenesPerUMI <- log10(merged_seurat$nFeature_RNA) / log10(merged_seurat$nCount_RNA)
# Compute percent mito ratio
merged_seurat$mitoRatio <- PercentageFeatureSet(object = merged_seurat, pattern = "^HyS0613.")
merged_seurat$mitoRatio <- merged_seurat@meta.data$mitoRatio / 100
# Create metadata dataframe
metadata <- merged_seurat@meta.data
# Create sample column
metadata$sample <- NA
metadata$sample <- rep("Methanol_New", dim(metadata)[1])
# Rename columns, "new.name = old.name"
metadata <- metadata %>%
dplyr::rename(seq_folder = orig.ident, # seq_folder
nUMI = nCount_RNA,
nGene = nFeature_RNA)
# Add metadata back to Seurat object
merged_seurat@meta.data <- metadata
tail(merged_seurat@meta.data)
## seq_folder nUMI nGene
## BR5_lib2_TTTGTTGCAGGACTTT-1 BR5_lib2_filtered_feature_bc_matrix 2104 1119
## BR5_lib2_TTTGTTGGTAAGATCA-1 BR5_lib2_filtered_feature_bc_matrix 1968 1074
## BR5_lib2_TTTGTTGGTCAAATCC-1 BR5_lib2_filtered_feature_bc_matrix 1951 1190
## BR5_lib2_TTTGTTGGTTACCCTC-1 BR5_lib2_filtered_feature_bc_matrix 806 556
## BR5_lib2_TTTGTTGTCAAATAGG-1 BR5_lib2_filtered_feature_bc_matrix 3390 1600
## BR5_lib2_TTTGTTGTCATTTGGG-1 BR5_lib2_filtered_feature_bc_matrix 924 635
## log10GenesPerUMI mitoRatio sample
## BR5_lib2_TTTGTTGCAGGACTTT-1 0.9174806 0.0000000000 Methanol_New
## BR5_lib2_TTTGTTGGTAAGATCA-1 0.9201522 0.0005081301 Methanol_New
## BR5_lib2_TTTGTTGGTCAAATCC-1 0.9347436 0.0000000000 Methanol_New
## BR5_lib2_TTTGTTGGTTACCCTC-1 0.9445142 0.0000000000 Methanol_New
## BR5_lib2_TTTGTTGTCAAATAGG-1 0.9076314 0.0000000000 Methanol_New
## BR5_lib2_TTTGTTGTCATTTGGG-1 0.9450721 0.0010822511 Methanol_New
# OPTIONAL:
#save(merged_seurat, file="data/merged_seurat_step01.RData")
# Visualize the number of cell counts per sample
merged_seurat@meta.data %>%
ggplot(aes(x=sample, fill=sample)) +
geom_bar() +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
theme(plot.title = element_text(hjust=0.5, face="bold")) +
ggtitle("NCells")
# Visualize the number UMIs/transcripts per cell
merged_seurat@meta.data %>%
ggplot(aes(color=sample, x=nUMI, fill= sample)) +
geom_density(alpha = 0.2) +
scale_x_log10() +
theme_classic() +
ylab("Cell density") +
geom_vline(xintercept = 300)
# Visualize the distribution of genes detected per cell via histogram
merged_seurat@meta.data %>%
ggplot(aes(color=sample, x=nGene, fill= sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_log10() +
geom_vline(xintercept = 100)
# Visualize the overall complexity of the gene expression by visualizing the genes detected per UMI (novelty score)
merged_seurat@meta.data %>%
ggplot(aes(x=log10GenesPerUMI, color = sample, fill=sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
geom_vline(xintercept = 0.8)
# Visualize the distribution of mitochondrial gene expression detected per cell
merged_seurat@meta.data %>%
ggplot(aes(color=sample, x=mitoRatio, fill=sample)) +
geom_density(alpha = 0.2) +
scale_x_log10() +
theme_classic() +
geom_vline(xintercept = 0.2)
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 18326 rows containing non-finite values (`stat_density()`).
# Visualize the correlation between genes detected and number of UMIs and determine whether strong presence of cells with low numbers of genes/UMIs
merged_seurat@meta.data %>%
ggplot(aes(x=nUMI, y=nGene, color=mitoRatio)) +
geom_point() +
scale_colour_gradient(low = "gray90", high = "black") +
stat_smooth(method=lm) +
scale_x_log10() +
scale_y_log10() +
theme_classic() +
geom_vline(xintercept = 500) +
geom_hline(yintercept = 250) +
facet_wrap(~sample)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
# This cutoff to match with ACME
filtered_seurat <- subset(x = merged_seurat,
subset= (nUMI >= 300& nUMI <=100000) &
(nGene >= 100 & nGene <7000) &
(log10GenesPerUMI > 0.80) &
(mitoRatio < 0.20))
filtered_seurat <- SCTransform(filtered_seurat, vst.flavor = "v2") %>%
RunPCA(npcs = 50) %>%
RunUMAP(dims = 1:15) %>%
FindNeighbors(reduction = "pca", dims = 1:15) %>%
FindClusters(resolution = 0.5)
##
|
| | 0%
|
|================== | 25%
|
|=================================== | 50%
|
|==================================================== | 75%
|
|======================================================================| 100%
##
|
| | 0%
|
|== | 3%
|
|==== | 6%
|
|====== | 9%
|
|======== | 11%
|
|========== | 14%
|
|============ | 17%
|
|============== | 20%
|
|================ | 23%
|
|================== | 26%
|
|==================== | 29%
|
|====================== | 31%
|
|======================== | 34%
|
|========================== | 37%
|
|============================ | 40%
|
|============================== | 43%
|
|================================ | 46%
|
|================================== | 49%
|
|==================================== | 51%
|
|====================================== | 54%
|
|======================================== | 57%
|
|========================================== | 60%
|
|============================================ | 63%
|
|============================================== | 66%
|
|================================================ | 69%
|
|================================================== | 71%
|
|==================================================== | 74%
|
|====================================================== | 77%
|
|======================================================== | 80%
|
|========================================================== | 83%
|
|============================================================ | 86%
|
|============================================================== | 89%
|
|================================================================ | 91%
|
|================================================================== | 94%
|
|==================================================================== | 97%
|
|======================================================================| 100%
##
|
| | 0%
|
|== | 3%
|
|==== | 6%
|
|====== | 9%
|
|======== | 11%
|
|========== | 14%
|
|============ | 17%
|
|============== | 20%
|
|================ | 23%
|
|================== | 26%
|
|==================== | 29%
|
|====================== | 31%
|
|======================== | 34%
|
|========================== | 37%
|
|============================ | 40%
|
|============================== | 43%
|
|================================ | 46%
|
|================================== | 49%
|
|==================================== | 51%
|
|====================================== | 54%
|
|======================================== | 57%
|
|========================================== | 60%
|
|============================================ | 63%
|
|============================================== | 66%
|
|================================================ | 69%
|
|================================================== | 71%
|
|==================================================== | 74%
|
|====================================================== | 77%
|
|======================================================== | 80%
|
|========================================================== | 83%
|
|============================================================ | 86%
|
|============================================================== | 89%
|
|================================================================ | 91%
|
|================================================================== | 94%
|
|==================================================================== | 97%
|
|======================================================================| 100%
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 29291
## Number of edges: 1011070
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9394
## Number of communities: 22
## Elapsed time: 4 seconds
#
DimPlot(filtered_seurat,label = T) + NoLegend() + NoAxes() # similar to what's seen before
# # suspect C2 is artefact cluster
# removed C2
no_C2 <- subset(filtered_seurat, idents = "2", invert=T)
DimPlot(no_C2)
rerun UMAP and recluster
no_C2 <- SCTransform(no_C2, vst.flavor = "v2") %>%
RunPCA(npcs = 50) %>%
RunUMAP(dims = 1:15) %>%
FindNeighbors(reduction = "pca", dims = 1:15) %>%
FindClusters(resolution = 0.5)
##
|
| | 0%
|
|================== | 25%
|
|=================================== | 50%
|
|==================================================== | 75%
|
|======================================================================| 100%
##
|
| | 0%
|
|== | 3%
|
|==== | 6%
|
|====== | 9%
|
|======== | 11%
|
|========== | 14%
|
|============ | 17%
|
|============== | 20%
|
|================ | 23%
|
|================== | 26%
|
|==================== | 29%
|
|====================== | 31%
|
|======================== | 34%
|
|========================== | 37%
|
|============================ | 40%
|
|============================== | 43%
|
|================================ | 46%
|
|================================== | 49%
|
|==================================== | 51%
|
|====================================== | 54%
|
|======================================== | 57%
|
|========================================== | 60%
|
|============================================ | 63%
|
|============================================== | 66%
|
|================================================ | 69%
|
|================================================== | 71%
|
|==================================================== | 74%
|
|====================================================== | 77%
|
|======================================================== | 80%
|
|========================================================== | 83%
|
|============================================================ | 86%
|
|============================================================== | 89%
|
|================================================================ | 91%
|
|================================================================== | 94%
|
|==================================================================== | 97%
|
|======================================================================| 100%
##
|
| | 0%
|
|== | 3%
|
|==== | 6%
|
|====== | 9%
|
|======== | 11%
|
|========== | 14%
|
|============ | 17%
|
|============== | 20%
|
|================ | 23%
|
|================== | 26%
|
|==================== | 29%
|
|====================== | 31%
|
|======================== | 34%
|
|========================== | 37%
|
|============================ | 40%
|
|============================== | 43%
|
|================================ | 46%
|
|================================== | 49%
|
|==================================== | 51%
|
|====================================== | 54%
|
|======================================== | 57%
|
|========================================== | 60%
|
|============================================ | 63%
|
|============================================== | 66%
|
|================================================ | 69%
|
|================================================== | 71%
|
|==================================================== | 74%
|
|====================================================== | 77%
|
|======================================================== | 80%
|
|========================================================== | 83%
|
|============================================================ | 86%
|
|============================================================== | 89%
|
|================================================================ | 91%
|
|================================================================== | 94%
|
|==================================================================== | 97%
|
|======================================================================| 100%
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 26366
## Number of edges: 927969
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9463
## Number of communities: 21
## Elapsed time: 3 seconds
DimPlot(no_C2,label = T) + NoLegend() + NoAxes()
FeaturePlot(no_C2, features = c("HyS0018.100", "HyS0061.60", #HMG, PCNA
"HyS0008.263", "HyS0030.203", #Ncol1, Nematocilin
"HyS0041.99", "HyS0004.446", #chitinase,mucs
"HyS0085.53", "HyS0013.338", #ELAV, Rfamide
"HyS0017.253", "HyS0010.323",
"HyS0001.667", "HyS0011.54"), #Wnt5, Pitx
order = T, min.cutoff = 'q10')
## Warning: Could not find HyS0001.667 in the default search locations, found in
## RNA assay instead
## Warning in FeaturePlot(no_C2, features = c("HyS0018.100", "HyS0061.60", : All
## cells have the same value (0) of rna_HyS0001.667.
#save(no_C2, file="output/filtered_seurat_PC50_dims15_res0.5_wo_artefact.RData")
sessionInfo()
## R version 4.2.1 (2022-06-23)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur ... 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] RCurl_1.98-1.10 cowplot_1.1.1
## [3] scales_1.2.1 Matrix_1.5-3
## [5] lubridate_1.9.2 forcats_1.0.0
## [7] stringr_1.5.0 dplyr_1.1.0
## [9] purrr_1.0.1 readr_2.1.4
## [11] tidyr_1.3.0 tibble_3.2.0
## [13] ggplot2_3.4.1 tidyverse_2.0.0
## [15] SeuratObject_4.1.3 Seurat_4.3.0
## [17] SingleCellExperiment_1.18.1 SummarizedExperiment_1.26.1
## [19] Biobase_2.56.0 GenomicRanges_1.48.0
## [21] GenomeInfoDb_1.32.4 IRanges_2.30.1
## [23] S4Vectors_0.34.0 BiocGenerics_0.42.0
## [25] MatrixGenerics_1.8.1 matrixStats_0.63.0
##
## loaded via a namespace (and not attached):
## [1] plyr_1.8.8 igraph_1.4.1
## [3] lazyeval_0.2.2 sp_1.6-0
## [5] splines_4.2.1 listenv_0.9.0
## [7] scattermore_0.8 digest_0.6.31
## [9] htmltools_0.5.4 fansi_1.0.4
## [11] magrittr_2.0.3 tensor_1.5
## [13] cluster_2.1.4 ROCR_1.0-11
## [15] tzdb_0.3.0 globals_0.16.2
## [17] R.utils_2.12.2 timechange_0.2.0
## [19] spatstat.sparse_3.0-1 colorspace_2.1-0
## [21] ggrepel_0.9.3 xfun_0.37
## [23] jsonlite_1.8.4 progressr_0.13.0
## [25] spatstat.data_3.0-1 survival_3.5-5
## [27] zoo_1.8-11 glue_1.6.2
## [29] polyclip_1.10-4 gtable_0.3.1
## [31] zlibbioc_1.42.0 XVector_0.36.0
## [33] leiden_0.4.3 DelayedArray_0.22.0
## [35] future.apply_1.10.0 abind_1.4-5
## [37] DBI_1.1.3 spatstat.random_3.1-4
## [39] miniUI_0.1.1.1 Rcpp_1.0.10
## [41] viridisLite_0.4.1 xtable_1.8-4
## [43] reticulate_1.28 htmlwidgets_1.6.1
## [45] httr_1.4.5 RColorBrewer_1.1-3
## [47] ellipsis_0.3.2 ica_1.0-3
## [49] farver_2.1.1 R.methodsS3_1.8.2
## [51] pkgconfig_2.0.3 sass_0.4.5
## [53] uwot_0.1.14 deldir_1.0-6
## [55] utf8_1.2.3 labeling_0.4.2
## [57] tidyselect_1.2.0 rlang_1.1.0
## [59] reshape2_1.4.4 later_1.3.0
## [61] munsell_0.5.0 tools_4.2.1
## [63] cachem_1.0.7 cli_3.6.0
## [65] generics_0.1.3 ggridges_0.5.4
## [67] evaluate_0.20 fastmap_1.1.1
## [69] yaml_2.3.7 goftest_1.2-3
## [71] knitr_1.42 fitdistrplus_1.1-8
## [73] RANN_2.6.1 sparseMatrixStats_1.8.0
## [75] pbapply_1.7-0 future_1.32.0
## [77] nlme_3.1-162 mime_0.12
## [79] R.oo_1.25.0 compiler_4.2.1
## [81] rstudioapi_0.14 plotly_4.10.1
## [83] png_0.1-8 spatstat.utils_3.0-2
## [85] glmGamPoi_1.8.0 bslib_0.4.2
## [87] stringi_1.7.12 highr_0.10
## [89] lattice_0.20-45 vctrs_0.5.2
## [91] pillar_1.8.1 lifecycle_1.0.3
## [93] spatstat.geom_3.1-0 lmtest_0.9-40
## [95] jquerylib_0.1.4 RcppAnnoy_0.0.20
## [97] data.table_1.14.8 bitops_1.0-7
## [99] irlba_2.3.5.1 httpuv_1.6.9
## [101] patchwork_1.1.2 R6_2.5.1
## [103] promises_1.2.0.1 KernSmooth_2.23-20
## [105] gridExtra_2.3 parallelly_1.34.0
## [107] codetools_0.2-19 MASS_7.3-58.3
## [109] withr_2.5.0 sctransform_0.3.5
## [111] GenomeInfoDbData_1.2.8 mgcv_1.8-42
## [113] parallel_4.2.1 hms_1.1.2
## [115] grid_4.2.1 DelayedMatrixStats_1.18.2
## [117] rmarkdown_2.20 Rtsne_0.16
## [119] spatstat.explore_3.1-0 shiny_1.7.4